{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "13f51a4b",
   "metadata": {},
   "source": [
    "# 使用Bert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0771052",
   "metadata": {},
   "outputs": [
    {
     "ename": "OSError",
     "evalue": "Can't load tokenizer for 'google-bert/bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'google-bert/bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n",
      "\u001b[0;31mOSError\u001b[0m                                   Traceback (most recent call last)\n",
      "Cell \u001b[0;32mIn[3], line 5\u001b[0m\n",
      "\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n",
      "\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertModel\n",
      "\u001b[0;32m----> 5\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mBertTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgoogle-bert/bert-base-uncased\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "\u001b[1;32m      6\u001b[0m model \u001b[38;5;241m=\u001b[39m BertModel\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgoogle-bert/bert-base-uncased\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
      "\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_bert_embedding\u001b[39m(text):\n",
      "\n",
      "File \u001b[0;32m~/workspace/jupyter/.venv/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:2046\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)\u001b[0m\n",
      "\u001b[1;32m   2043\u001b[0m \u001b[38;5;66;03m# If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be\u001b[39;00m\n",
      "\u001b[1;32m   2044\u001b[0m \u001b[38;5;66;03m# loaded directly from the GGUF file.\u001b[39;00m\n",
      "\u001b[1;32m   2045\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mall\u001b[39m(full_file_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m full_file_name \u001b[38;5;129;01min\u001b[39;00m resolved_vocab_files\u001b[38;5;241m.\u001b[39mvalues()) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m gguf_file:\n",
      "\u001b[0;32m-> 2046\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n",
      "\u001b[1;32m   2047\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt load tokenizer for \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpretrained_model_name_or_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. If you were trying to load it from \u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
      "\u001b[1;32m   2048\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhttps://huggingface.co/models\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, make sure you don\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt have a local directory with the same name. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
      "\u001b[1;32m   2049\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOtherwise, make sure \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpretrained_model_name_or_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m is the correct path to a directory \u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
      "\u001b[1;32m   2050\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontaining all relevant files for a \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m tokenizer.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
      "\u001b[1;32m   2051\u001b[0m     )\n",
      "\u001b[1;32m   2053\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file_id, file_path \u001b[38;5;129;01min\u001b[39;00m vocab_files\u001b[38;5;241m.\u001b[39mitems():\n",
      "\u001b[1;32m   2054\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m file_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m resolved_vocab_files:\n",
      "\n",
      "\u001b[0;31mOSError\u001b[0m: Can't load tokenizer for 'google-bert/bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'google-bert/bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer."
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "from transformers import BertTokenizer, BertModel\n",
    "\n",
    "tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')\n",
    "model = BertModel.from_pretrained('google-bert/bert-base-uncased')\n",
    "\n",
    "def get_bert_embedding(text):\n",
    "    inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')\n",
    "    with torch.no_grad():\n",
    "        outputs = model(**inputs)\n",
    "    return outputs.last_hidden_state[:, :3, :].numpy()\n",
    "\n",
    "text = \"This is an apple\"\n",
    "get_bert_embedding(text)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
